# Drop a user-defined object named `range` from the current workspace, if one
# exists (presumably so that base::range() is not masked -- confirm intent).
# The existence check avoids the "object 'range' not found" warning that a
# bare rm(range) emits when no such object is present.
if (exists("range", inherits = FALSE)) rm(range)

library(readxl)
library(keyATM)
library(tidyverse)
library(quanteda)
library(reshape2)
library(ggplot2)
library(stringr)
library(showtext)

# ----------------------------------------------------------
# Load data
# NOTE: Adjust the file paths below to match where you downloaded the data files.
# ----------------------------------------------------------
# News data
# Read the crawled news workbook without a header row; the text is then taken
# from the column that is named "...5" in the resulting table.
news_raw <- read_excel("PATH/TO/news_crawling.xlsx", col_names = FALSE)

# Drop missing cells first, then collapse repeated whitespace.
text_data <- news_raw[["...5"]]
text_data <- str_squish(text_data[!is.na(text_data)])

# Split every article on commas and rejoin the pieces with single spaces.
doc_list <- str_split(text_data, ",")
news_vec <- vapply(doc_list, paste, character(1), collapse = " ")

# Candidate data
lee_data  <- read_csv("PATH/TO/Text_data_Lee.csv")
yoon_data <- read_csv("PATH/TO/Text_data_Yoon.csv")


# Preprocessing helpers
# Lower-case the `remarks` column of `data` and return a table that keeps only
# the result, stored in a column called `clean_text`.
preprocess_text <- function(data) {
  with_clean <- mutate(data, clean_text = tolower(remarks))
  select(with_clean, clean_text)
}
# Keep only tokens that consist entirely of two or more Hangul syllables.
# Everything else is dropped, and no padding is left in its place.
remove_one_char <- function(tokens_list) {
  hangul_2plus <- "^[가-힣]{2,}$"
  tokens_select(
    tokens_list,
    pattern   = hangul_2plus,
    valuetype = "regex",
    padding   = FALSE
  )
}
# Build a document-feature matrix from a character vector of documents.
#
# Arguments:
#   x             Character vector, one element per document.
#   my_stopwords  Character vector of token patterns removed before filtering.
#
# Steps: tokenise (dropping punctuation and symbols), remove the stopwords,
# keep only tokens of 2+ Hangul syllables, then tabulate with dfm().
#
# Returns the result of dfm() on the filtered tokens.
preprocess_text_vec <- function(x, my_stopwords) {
  toks <- tokens(corpus(x), remove_punct = TRUE, remove_symbols = TRUE)
  toks <- tokens_remove(toks, pattern = my_stopwords)
  # Reuse the shared Hangul filter rather than repeating its regex here, so
  # the two cannot drift apart.
  toks <- remove_one_char(toks)
  dfm(toks)
}

# Stopwords (unchanged)
# Custom Korean stopword list. It is passed to preprocess_text_vec(), which
# hands it to tokens_remove() as the removal pattern.
# NOTE(review): several entries are listed more than once (e.g. "이런", "이제",
# "그때", "같아요"); presumably harmless for matching, but confirm before
# relying on length(my_stopwords).
# NOTE(review): the one-syllable entry "즉" would also be discarded by the
# 2+ syllable Hangul filter that runs after stopword removal.
my_stopwords <- c(
  "이런","우리","저는","제가","지금","그런","하는","있는","때문에","그리고","그것","이제","합니다","그래서","즉","하지만","여러분",
  "말씀을","정말","이렇게","아까","하고","대해서","않습니까","것은","그런데","말씀","이미","그렇게","있습니다","많이","사실","너무",
  "하겠습니다","매우","번째","있는데","해서","한번","생각합니다","됩니다","우리가","겁니다","다른","저희가","말씀하신","것이","이게",
  "것처럼","하면","한다","드립니다","없는","정말로","저도","아니고","생각이","내가","통해서","먼저","가지고","된다","가장","위한",
  "어떻게","같습니다","되는","또는","듭니다","좋은","예를","그걸","가지","알겠습니다","드리고","해야","없습니다","계속","일단","제대로",
  "있다","그건","건지","여쭤보겠습니다","아니","사실은","맞습니다","전적으로","많은","드리고요","하지","같아요","결국","답을","식으로",
  "아마","구체적으로","이번","선택을","가지는","싶습니다","그러나","한다는","그때","저한테","그게","자체를","그때","같은","당연히",
  "있게","가능한","지금도","이해가","싶은","그러니까","특히","거죠","지금은","아닌","아니라","된다는","때는","있다고","만들고","그렇죠",
  "아주","거의","이유는","혹시","다시","바로","하게","드리는","중에","피해를","되는데","똑같은","확실하게","말씀드리겠습니다",
  "그거는","같아요","있으면","같은","그게","정도","이런","이제","다시","조금","이거","저희","일단","왜냐하면","그런데","그래서",
  "그러니까","이렇습니다","그렇습니다","있습니다","없습니다","합니다","합시다","했는데","됐습니다","있어요","없어요","어떤",
  "하셨는데","되면","후보께서","것을","얘기를","글쎄","보면","여러","그러면","거기에","아니겠습니까","대한","이거를","알고","있고",
  "것입니다","결국은","된다고","것이고","한다고","대해서는","보니까","이거는","거는","것도","나온","되고","하나","번째로","않고",
  "하는데","것이고","말씀드립니다","전에","있다는"
)

# Each source goes through the same three steps: build the DFM, drop documents
# that have no tokens left after cleaning, and assign sequential doc names.

# News articles
dfm_news <- preprocess_text_vec(news_vec, my_stopwords)
dfm_news <- dfm_subset(dfm_news, ntoken(dfm_news) > 0)
docnames(dfm_news) <- paste0("News_", seq_len(ndoc(dfm_news)))

# Candidate Lee (remarks lower-cased into a plain character vector first)
lee_vec <- tolower(lee_data$remarks)
dfm_lee <- preprocess_text_vec(lee_vec, my_stopwords)
dfm_lee <- dfm_subset(dfm_lee, ntoken(dfm_lee) > 0)
docnames(dfm_lee) <- paste0("Lee_", seq_len(ndoc(dfm_lee)))

# Candidate Yoon
yoon_vec <- tolower(yoon_data$remarks)
dfm_yoon <- preprocess_text_vec(yoon_vec, my_stopwords)
dfm_yoon <- dfm_subset(dfm_yoon, ntoken(dfm_yoon) > 0)
docnames(dfm_yoon) <- paste0("Yoon_", seq_len(ndoc(dfm_yoon)))

# Stack both candidates (renumbered as "Cand_<i>"), then put the news
# documents in front so everything lives in one DFM.
dfm_cand <- rbind(dfm_lee, dfm_yoon)
docnames(dfm_cand) <- paste0("Cand_", seq_len(ndoc(dfm_cand)))
dfm_all <- rbind(dfm_news, dfm_cand)

# Keywords (unchanged)
# Seed keywords per topic. After filtering, the list names are reused as the
# column names of `theta` further down, so the order of the entries matters.
# Keywords that do not occur in the combined DFM are removed afterwards by
# filter_keywords().
# NOTE(review): "기본소득" is listed under both `economy` and `welfare` --
# confirm that the overlap is intended.
keywords <- list(
  economy = c("경제","기본소득","소득","국채","세금","증세","이자율","가계부채","구조조정","시장","재정","통화"),
  budget  = c("예산","재원","발행","지원금","재난지원금","손실보상법","불균형","현금"),
  gender  = c("여성","여자","페미니즘","성평등","성차별","젠더","하사","중사","다양성","성범죄"),
  security_foreign = c("안보","평화","전쟁","미사일","우크라이나","전술핵","북한","동맹","침공","나토","젤렌스키","러시아","외교","국방","한미일","방위산업","군사적"),
  welfare = c("청년","저출산","아이","일자리","지원","복지","교육","안전망","기본소득","장애인"),
  daejangdong = c("대장동","김만배","녹취록","수사","화천대유","도시개발공사","공소장","게이트"),
  electoral_reform = c("위성정당","연동형","선거법","대표","개헌","선거제도","개혁","합당"),
  party_politics   = c("정당","민주당","국민의힘","후보","단일화"),
  policy_general   = c("정책","정책에","정책으로","정책을","정책이","정책의","공약","공약을","공약으로")
)

# Restrict every keyword set in `kw` to the terms found among the feature
# names of `dfm`. The result is a list with the same names and order as `kw`;
# a set with no surviving term comes back as an empty character vector.
filter_keywords <- function(dfm, kw) {
  keep_known <- function(ws) {
    present <- ws %in% featnames(dfm)
    ws[present]
  }
  lapply(kw, keep_known)
}
# Keep only the keyword sets that still have at least one term present in the
# combined DFM.
keywords_all <- Filter(
  function(ws) length(ws) > 0,
  filter_keywords(dfm_all, keywords)
)

# Single keyATM run over the pooled corpus (seed and options kept as they were)
set.seed(123)
docs_all <- keyATM_read(dfm_all)
mod_all <- keyATM(
  docs              = docs_all,
  model             = "base",
  keywords          = keywords_all,
  no_keyword_topics = 0,
  options           = list(seed = 123, iterations = 100)
)

# Copy of the model's `theta` matrix with columns relabelled by keyword-set name
theta <- mod_all$theta
colnames(theta) <- names(keywords_all)


# Plot with English label replacement (keep behavior)
library(ggplot2)
library(grid)
library(gtable)
library(ggplotify)

# Korean -> English lookup used to relabel plot text. It is passed as a named
# pattern vector to str_replace_all(), with the names as patterns and the
# values as replacements.
ko2en <- c(
  "경제"="Economy","국채"="Government bonds","세금"="Tax","증세"="Tax increase","이자율"="Interest rate",
  "가계부채"="Household debt","구조조정"="Restructuring","시장"="Market","재정"="Fiscal","통화"="Currency",
  "예산"="Budget","재원"="Fiscal resources","발행"="Issuance","지원금"="Subsidy","보상"="Compensation","현금"="Cash",
  "여성"="Women","여자"="Female","페미니즘"="Feminism","성평등"="Gender equality","성차별"="Gender discrimination",
  "젠더"="Gender","성범죄"="Sex crime","다양성"="Diversity","남성"="Men","사람"="People","구조"="Structure","생각"="Thought",
  "안보"="Security","평화"="Peace","전쟁"="War","미사일"="Missile","우크라이나"="Ukraine","러시아"="Russia","침공"="Invasion",
  "동맹"="Alliance","국방"="National defense","발언"="Remark","군사적"="Military",
  "청년"="Youth","저출산"="Low fertility","아이"="Child","일자리"="Jobs","지원"="Support","복지"="Welfare","교육"="Education",
  "안전망"="Safety net","장애인"="Disabled person","정책"="Policy","공약"="Pledge","투표"="Vote","유권자"="Voter",
  "정치"="Politics","대통령"="President","국민"="Nation","국가"="State","정부"="Government","정당"="Party","민주당"="Democratic Party",
  "국민의힘"="People Power Party","후보"="Candidate","단일화"="Candidate consolidation","대선"="Presidential election","토론"="Debate",
  "대표"="Represent","대장동"="Daejang-dong","녹취록"="Recording transcript","수사"="Investigation","의혹"="Allegation",
  "검찰"="Prosecutors","이재명"="Lee","김만배"="Kim Man-bae","화천대유"="Hwacheon Daeyu","도시개발공사"="Urban Development Corporation",
  "공소장"="Indictment","게이트"="Scandal","윤석열"="Yoon","위성정당"="Satellite party","연동형"="Compensatory PR system",
  "선거법"="Election law","개헌"="Constitutional amendment","선거제도"="Electoral system","개혁"="Reform","합당"="Party merger",
  "선거"="Election","코로나"="COVID-19"
)

# str_replace_all() applies a named vector one pattern after another, in
# order. As written above, "국민" precedes "국민의힘" and "정당" precedes
# "위성정당", so the longer terms would be half-translated ("Nation의힘",
# "위성Party"). Sorting the keys longest-first makes every compound term
# match before any shorter key it contains; ties keep their listed order.
ko2en <- ko2en[order(-nchar(names(ko2en), type = "chars"))]

# Null-coalescing operator: gives `b` when `a` is NULL, otherwise `a`.
`%||%` <- function(a, b) {
  if (is.null(a)) b else a
}
# Pull a ggplot object out of `x`.
#
# `x` is returned unchanged when it already inherits from "ggplot". For a
# list, the first non-NULL of `x$plot`, `x$figure`, `x$ggplot`, `x$g` is tried;
# if that candidate is not a ggplot, the first list element that inherits from
# "ggplot" is returned instead. Anything else raises an error.
extract_ggplot <- function(x) {
  is_gg <- function(obj) inherits(obj, "ggplot")

  if (is_gg(x)) {
    return(x)
  }

  if (is.list(x)) {
    # Only the first non-NULL slot is considered here.
    preferred <- x$plot %||% x$figure %||% x$ggplot %||% x$g
    if (is_gg(preferred)) {
      return(preferred)
    }

    # Fall back to scanning every element, in order.
    hits <- which(vapply(x, is_gg, logical(1)))
    if (length(hits) >= 1) {
      return(x[[hits[1]]])
    }
  }

  stop("No ggplot object found inside `plot_topicprop()` result.")
}

# Turn on showtext rendering (presumably so the Hangul labels display
# correctly in the figure -- confirm).
showtext_auto()

# Topic-proportion figure from the fitted model.
p <- plot_topicprop(mod_all, n = 6)

# If the returned object carries a `label` column in `$data`, translate it in
# place; otherwise this step does nothing.
if (!is.null(p$data$label)) {
  p$data$label <- str_replace_all(p$data$label, ko2en)
}

# Unwrap the ggplot and convert it to a grob table for text replacement.
p_gg <- extract_ggplot(p)
g <- ggplotGrob(p_gg)

# Recursively translate Korean text inside a grob.
#
# A grob of class "text" with a non-NULL `label` has that label passed through
# str_replace_all() with the `ko2en` lookup. Every element of `grob$children`
# (when present) is processed the same way. Returns the modified grob.
translate_text_grob <- function(grob) {
  if (inherits(grob, "text") && !is.null(grob$label)) {
    grob$label <- str_replace_all(grob$label, ko2en)
  }

  kids <- grob$children
  if (is.null(kids)) {
    return(grob)
  }

  for (k in seq_along(kids)) {
    kids[[k]] <- translate_text_grob(kids[[k]])
  }
  grob$children <- kids
  grob
}
# Translate each top-level grob of the table in turn.
for (i in seq_along(g$grobs)) {
  g$grobs[[i]] <- translate_text_grob(g$grobs[[i]])
}

# Wrap the edited grob table back into a ggplot object and write it to PDF.
p_out <- ggplotify::as.ggplot(g)
ggsave(
  filename = "keyatm_prop_en.pdf",
  plot     = p_out,
  device   = cairo_pdf,
  width    = 10,
  height   = 6
)

# Display the object returned by plot_topicprop() (not the translated copy).
print(p)

# Additional styling (kept to preserve original downstream appearance)
library(scales)

# Restyle the extracted ggplot: percent-formatted x axis limited to 0-0.6,
# panel clipping disabled, no panel border, a wide right margin and larger
# axis fonts.
p_gg <- p_gg +
  scale_x_continuous(
    name   = "Expected topic proportions",
    breaks = seq(0, 0.6, by = 0.2),
    limits = c(0, 0.6),
    labels = percent_format(accuracy = 1),
    expand = expansion(mult = c(0, 0.02))
  ) +
  coord_cartesian(clip = "off") +
  theme(
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    axis.text.x  = element_text(size = 12),
    axis.text.y  = element_text(size = 13),
    plot.margin  = margin(12, 80, 12, 12),
    panel.border = element_blank()
  )

# Rebuild the grob table, switch off clipping for the layout entry named
# "panel", and append a 3 cm column after the last existing column.
g <- ggplotGrob(p_gg)
g$layout$clip <- replace(g$layout$clip, g$layout$name == "panel", "off")
g <- gtable::gtable_add_cols(g, widths = grid::unit(3, "cm"), pos = ncol(g))

# Recursively translate Korean text inside a grob, shrinking list-like labels.
#
# A grob of class "text" with a non-NULL `label` has that label passed through
# str_replace_all() with the `ko2en` lookup; if any translated label contains
# a comma, the grob's font size is set to 10. Every element of `grob$children`
# (when present) is processed the same way. Returns the modified grob.
translate_text_grob <- function(grob) {
  if (inherits(grob, "text") && !is.null(grob$label)) {
    grob$label <- str_replace_all(grob$label, ko2en)
    has_comma <- any(grepl(",", grob$label))
    if (has_comma) {
      grob$gp$fontsize <- 10
    }
  }

  kids <- grob$children
  if (is.null(kids)) {
    return(grob)
  }

  for (k in seq_along(kids)) {
    kids[[k]] <- translate_text_grob(kids[[k]])
  }
  grob$children <- kids
  grob
}
# Translate each top-level grob of the restyled table in turn.
for (i in seq_along(g$grobs)) {
  g$grobs[[i]] <- translate_text_grob(g$grobs[[i]])
}

# Wrap the edited grob table back into a ggplot object and display it.
p_out <- ggplotify::as.ggplot(g)
p_out
